/****************************************************************************************************
*                                                                                                   *
*                                      Data Processing and Cleaning                                 *
*                                                                                                   *
* This .do file performs the following tasks:                                                       *
* 1. Imports raw baseline data                                                                      *
* 2. Labels variables and choices for survey versions                                               *
* 3. Cleans data by:                                                                                *
*    - Dropping unnecessary variables                                                               *
*    - Recoding variables for analysis                                                              *
* 4. Collapses data at the line-section-team level to merge with ratings data                       *
*                                                                                                   *
****************************************************************************************************/

*------------------------------*
*        Initialization        *
*------------------------------*

* Session setup: start from an empty workspace, never pause output for --more--,
* and raise the memory segment size before any data are loaded.
clear all
set more off
set segmentsize 3g  /* memory segment size */

*------------------------------*
*    Importing Baseline Data   *
*------------------------------*


* Load the raw baseline survey (Stata format); the global macro Data must hold the data root
use "$Data/Original/baseline_raw.dta", clear

* Remove the name fields that identify people, then order rows by respondent id
drop Enumerator_Name Employee_Name
sort _id

* Variables retained for cleaning, grouped by topic (keep does not reorder columns)
local ids       _id deviceid
local demog     Years_Schooling Type_Schooling SkillGrade EmploymentType Contractor_name ///
                ProductionLine Religion Caste YearJoined EnterEmployeeDateofBirthD ///
                Residence_type_family residence_size
local politics  political_support_2019 political_support_2014 NRC_support
local attitudes attitude_neighbour attitude_marriage_religion attitude_communication_religion ///
                attitude_orders_religion attitude_lend_religion trust_scale donate_choice IAT_score
local exposure  interaction_cross_rel hm_ratio_wf hm_ratio_nf

keep `ids' `demog' `politics' `attitudes' `exposure'

* Put the identifiers first and store the respondent id as a number
order _id deviceid
destring _id, replace



*------------------------------*
*        Data Cleaning         *
*------------------------------*

** Years of Schooling **
* The schooling question was skipped for respondents who said they never went to school,
* so a missing value here is recoded to zero years.
destring Years_Schooling, generate(school_years)
replace school_years = 0 if school_years >= .

** Self-Reported Skill Grade **
* encode assigns integer codes in alphabetical order of the text values
encode SkillGrade, generate(skill)

** Employment Type **
* Every respondent must have an employment type; strip stray blanks before encoding
assert !missing(EmploymentType)
replace EmploymentType = trim(EmploymentType)
encode EmploymentType, generate(employment)

** Contractor Names **
* Harmonise known spelling variants of two contractor codes
replace Contractor_name = "JJE" if Contractor_name == "JJJE"
replace Contractor_name = "MKE" if inlist(Contractor_name, "MKe", "Mke", "MkE")

** Religion and Caste **
* Religion must be present for everyone; later code treats code 1 as Hindu and 2 as Muslim
assert !missing(Religion)
replace Religion = trim(Religion)
encode Religion, generate(religion)
encode Caste, generate(caste)

** Political Support (Dummy for BJP Support) **
* Answers are lower-cased in place first. The dummy is 1 for "bjp", 0 for any other
* substantive answer, and missing for blanks and non-responses listed below.
* NOTE(review): the 2019 list spells "uneligible to vote" while the 2014 list spells
* "ineligible to vote" - confirm both match the raw values actually present.
replace political_support_2019 = lower(political_support_2019)
generate pol_support19 = (political_support_2019 == "bjp") ///
    if !inlist(political_support_2019, "", "did not vote", "didnt vote", ///
               "uneligible to vote", "na", "refuse to answer", "refused to answer")

* 2014: the non-response values are split over two inlist() calls because inlist()
* accepts at most 10 arguments when they are strings.
replace political_support_2014 = lower(political_support_2014)
generate pol_support14 = (political_support_2014 == "bjp") ///
    if !inlist(political_support_2014, "", "did not vote", "didnt vote", "ineligible to vote", ///
               "na", "n/a", "refuse to answer", "refused to answer") ///
     & !inlist(political_support_2014, "age criteria not met", "dont remember", "out of state", ///
               "not voted", "not applicable")

** Support for NRC **
* 1 = "Yes", 0 = any other substantive answer, missing for blank / dont know / refusal
generate nrc_support = (NRC_support == "Yes") ///
    if !inlist(NRC_support, "", "Dont Know", "Dont Know ", "Refuse to Answer")

** Preference for Non-Coreligionist Neighbour **
* 1 = "No", 0.5 = "Would not prefer, but would be ok", 0 = anything else, missing if blank
generate neighbour_attitude = ///
    cond(attitude_neighbour == "Would not prefer, but would be ok", 0.5, attitude_neighbour == "No") ///
    if attitude_neighbour != ""

** Generalized Trust (WVS) **
destring trust_scale, generate(trust)

** Altruism (WVS) **
* Log of one plus the donated amount, so that zero donations stay defined
destring donate_choice, generate(donate)
generate ln_donate = ln(1 + donate)

** Attitudes Towards Inter-Religious Marriage **
* Same 1 / 0.5 / 0 coding as the neighbour item; blank and "Dont Know" become missing
generate marriage_religion = ///
    cond(attitude_marriage_religion == "Would not prefer, but would be ok", 0.5, attitude_marriage_religion == "No") ///
    if !inlist(attitude_marriage_religion, "", "Dont Know")

** More comfortable Communicating with Coreligionists **
* First pass: 1 = "Always", 0.5 = "Sometimes", 0 = anything else; blank and "Dont Know" missing
generate comm_rel = ///
    cond(attitude_communication_religion == "Sometimes", 0.5, attitude_communication_religion == "Always") ///
    if !inlist(attitude_communication_religion, "", "Dont Know")

* Reverse the scale (1 becomes 0, 0 becomes 1, 0.5 and missing are unchanged) so that
* "Always" ends up as 0
replace comm_rel = 1 - comm_rel

** Comfortable Taking Orders from Non-Coreligionists **
* 1 = "Always", 0 = anything else; blank, refusal and "Dont Know" become missing
generate orders_religion = (attitude_orders_religion == "Always") ///
    if !inlist(attitude_orders_religion, "", "Refuse to Answer", "Dont Know")

** Age and Tenure **
* Both measures are counted in calendar years up to this reference year
local survey_year 2019

* Tenure: reference year minus the year of joining. Any value below one year
* (for example joining in the reference year itself) is set to 0.5; a missing
* joining date leaves tenure missing.
generate DOJ = date(YearJoined, "YMD")
format DOJ %td
generate yoj = year(DOJ)
generate tenure = cond(`survey_year' - yoj < 1, 0.5, `survey_year' - yoj)

** Date of Birth **
* Age: reference year minus the year of birth
generate DOB = date(EnterEmployeeDateofBirthD, "YMD")
format DOB %td
generate yob = year(DOB)
generate age = `survey_year' - yob

** Cross-Religion Interaction **
* 1 = "More than 5", 0.5 = "Less than 5", 0 for every other value.
* NOTE(review): a blank response is coded 0 here, not missing - confirm this is intended.
generate int_cross = ///
    cond(interaction_cross_rel == "Less than 5", 0.5, interaction_cross_rel == "More than 5")

** H-M Ratio in Neighbourhood **
* Take hm_ratio_wf where it is filled in and fall back to hm_ratio_nf otherwise;
* non-numeric text becomes missing because destring is run with force.
generate temp = cond(hm_ratio_wf == "", hm_ratio_nf, hm_ratio_wf)
destring temp, generate(ingroup_outgroup_r) force
drop temp

* The raw item is on a 0-10 scale that is flipped for Muslims (religion code 2) so the
* measure is expressed from the respondent's own group; its complement is the outgroup side.
replace ingroup_outgroup_r = 10 - ingroup_outgroup_r if religion == 2
generate outgroup_ingroup_r = 10 - ingroup_outgroup_r

** Standardize Baseline Exposure Measures **
* z-score each component over the full sample
foreach v in int_cross outgroup_ingroup_r comm_rel {
    quietly summarize `v', detail
    generate `v'_std = (`v' - r(mean)) / r(sd)
}

** Create Baseline Exposure Index **
* icw_index is not a built-in Stata command; it must already be installed or defined
icw_index int_cross_std outgroup_ingroup_r_std comm_rel_std, generate(baselineexp_index)

* Religion-specific versions: plain row means of the standardized components
egen baselineexp_index_hindu = rowmean(int_cross_std outgroup_ingroup_r_std comm_rel_std) if religion == 1
label variable baselineexp_index_hindu "Baseline Exposure Index (Hindu)"
egen baselineexp_index_muslim = rowmean(int_cross_std outgroup_ingroup_r_std comm_rel_std) if religion == 2
label variable baselineexp_index_muslim "Baseline Exposure Index (Muslim)"

** IAT Score **
destring IAT_score, generate(iat)

*------------------------------*
*      Labeling Variables      *
*------------------------------*

* Variable labels for the individual-level (pre-collapse) data.
* Fix: ingroup_outgroup_r used to carry the "Outgroup-Ingroup" text, which describes its
* complement outgroup_ingroup_r (= 10 - ingroup_outgroup_r). Each variable now gets the
* label that matches what it measures.
label variable school_years                    "Till which grade did you study in school?"
label variable tenure                          "Years worked at the factory"
label variable age                             "Age in Years"
label variable trust                           "WVS measure of trust"
label variable ln_donate                       "WVS measure of altruism"
label variable orders_religion                 "Are you comfortable listening to orders from a non-coreligionist?"
label variable neighbour_attitude              "Would you live next to a non-coreligionist? (Hindu/Muslim)"
label variable int_cross                       "Daily interactions with non-religion members (Hindu/Muslim)"
label variable ingroup_outgroup_r              "Ingroup-Outgroup ratio in locality"
label variable outgroup_ingroup_r              "Outgroup-Ingroup ratio in locality"
label variable baselineexp_index_hindu         "Baseline Exposure Index (Hindu)"
label variable baselineexp_index_muslim        "Baseline Exposure Index (Muslim)"
label variable baselineexp_index               "Baseline Exposure Index"
label variable comm_rel                        "Do you in general find it easier to communicate with co-religionists?"
label variable Type_Schooling                  "Type of school attended"
label variable Residence_type_family           "Currently live with family?"
label variable residence_size                  "Total number of household members"
label variable marriage_religion               "Accept inter-religious marriage for children? (Hindu/Muslim)"
label variable pol_support14                   "Political support in 2014 Indian General Elections"
label variable pol_support19                   "Political support in 2019 Indian General Elections"
label variable nrc_support                     "Support for National Register of Citizens in West Bengal?"
label variable attitude_lend_religion          "Willingness to lend money to another religion? (Hindu/Muslim)"
label variable donate_choice                   "Amount donated from Rs 2400"
label variable iat                             "IAT Score"


*-------------------------------*
* Merging with Randomized Teams *
*-------------------------------*

* Attach the randomized team assignment; only respondents found in both files are kept
merge 1:1 _id using "$Data/Original/Randomized_Teams.dta", keep(match) nogenerate

* Attach the share of old Muslim teammates, again keeping matched respondents only
merge 1:1 _id using "$Data/Original/PropOldTeammates.dta", keep(match) nogenerate

* Numeric version of the production line and group identifiers for
* line x section, line x team x section, and line x team x treatment arm
encode Line_R, generate(line_r)
egen line_sec   = group(Line_R new_section)
egen team_sec   = group(Line_R team new_section)
egen team_treat = group(Line_R team direct mixed)

* Flag workers whose assigned section differs from their original one,
* overall and separately for Muslims (religion 2) and Hindus (religion 1)
generate section_change        = (section != new_section)
generate section_change_muslim = section_change if religion == 2
generate section_change_hindu  = section_change if religion == 1

*------------------------------*
*  Section Level Aggregates    *
*------------------------------*

** Past Contact at Work **
* Religion indicators (religion code 2 = Muslim, 1 = Hindu, as used throughout this file)
generate muslim = (religion == 2)
generate hindu  = (religion == 1)

* Two grouping levels: line x shift, and line x shift x original section
local by_line    Line_R shift
local by_section Line_R shift section

* Head counts (all workers, Muslims, Hindus) at both levels
egen count_total_line    = count(_id),  by(`by_line')
egen count_total_line_s  = count(_id),  by(`by_section')
egen count_muslim_line   = sum(muslim), by(`by_line')
egen count_muslim_line_s = sum(muslim), by(`by_section')
egen count_hindu_line    = sum(hindu),  by(`by_line')
egen count_hindu_line_s  = sum(hindu),  by(`by_section')

* Share of Muslims / Hindus on the same line and shift, leaving out the worker's own section
generate share_m_wo = (count_muslim_line - count_muslim_line_s) / (count_total_line - count_total_line_s)
generate share_h_wo = (count_hindu_line - count_hindu_line_s) / (count_total_line - count_total_line_s)

* Exposure to the other group: Hindus get the Muslim share, Muslims get the Hindu share
generate mean_contact_hindu_l  = share_m_wo if religion == 1
generate mean_contact_muslim_l = share_h_wo if religion == 2

** Contact in Own Section **
* Share of the other group inside the worker's own section
egen mean_contact_ls = mean(muslim), by(`by_section')
egen mc              = mean(hindu),  by(`by_section')
replace mean_contact_ls = mc if religion == 2
generate mean_contact_hindu_ls  = mean_contact_ls if religion == 1
generate mean_contact_muslim_ls = mean_contact_ls if religion == 2

** Drop Unnecessary Variables **
drop muslim hindu count_total_line count_total_line_s count_muslim_line count_muslim_line_s ///
     count_hindu_line count_hindu_line_s mc

** Handle Missing Values and Standardize Exposure Measures **
* Exposure is undefined when the shift is unknown; after blanking those cases each
* measure is z-scored over the remaining observations.
local contact_vars mean_contact_hindu_l mean_contact_hindu_ls mean_contact_muslim_l mean_contact_muslim_ls
foreach var of local contact_vars {
    replace `var' = . if missing(shift)
    quietly summarize `var', detail
    generate `var'_std = (`var' - r(mean)) / r(sd)
}

** Tenure by Religion and Skill **
* Religion codes: 1 = Hindu, 2 = Muslim (as used throughout this file).
generate tenure_hindu     = tenure if religion == 1
generate tenure_muslim    = tenure if religion == 2

* Fix: a bare "if high_skilled" is also true when high_skilled is missing, because Stata
* treats missing as a nonzero value. Workers with unknown skill were therefore counted
* as skilled. They are now excluded from both groups ("!high_skilled" already evaluates
* to false for missing values).
generate tenure_skilled   = tenure if high_skilled & !missing(high_skilled)
generate tenure_unskilled = tenure if !high_skilled

** Contact Outside Work **
generate hindu_int_cross = int_cross if religion == 1

** Hindu and Muslim IAT Scores **
generate hindu_iat  = iat if religion == 1
generate muslim_iat = iat if religion == 2

* Identify Negative IAT Scores for Hindus (distribution is printed to the log)
summarize hindu_iat, detail
generate hindu_iatneg = (hindu_iat < 0) if !missing(hindu_iat)

* Identify Positive IAT Scores for Muslims (distribution is printed to the log)
summarize muslim_iat, detail
generate muslim_iatpos = (muslim_iat > 0) if !missing(muslim_iat)

** Communication and Orders by Religion **
generate comm_rel_hindus   = comm_rel if religion == 1
generate comm_rel_muslim   = comm_rel if religion == 2
generate orders_rel_hindus = orders_religion if religion == 1
generate orders_rel_muslim = orders_religion if religion == 2

* Index of Attitudes at Baseline
* z-score the three components (communication, orders, IAT) for Muslims first and then
* for Hindus; each religion-specific variable is missing for the other group, so the
* mean and standard deviation are computed within religion.
foreach component of varlist comm_rel_muslim orders_rel_muslim muslim_iat ///
                             comm_rel_hindus orders_rel_hindus hindu_iat {
    quietly summarize `component', detail
    generate `component'_std = (`component' - r(mean)) / r(sd)
}

* Create Attitudes Indices
* icw_index is not a built-in Stata command; it must already be installed or defined
icw_index comm_rel_muslim_std orders_rel_muslim_std muslim_iat_std, generate(b_attitude_muslim)
icw_index comm_rel_hindus_std orders_rel_hindus_std hindu_iat_std, generate(b_attitude_hindu)

** Political (Taste Parameters) **
* Defined for Hindus only (religion code 1); blanks and non-responses are missing.
generate nrc_hindu = (NRC_support == "Yes") ///
    if religion == 1 & !inlist(NRC_support, "", "Dont Know", "Dont Know ", "Refuse to Answer")

* political_support_2019 was lower-cased earlier in this file, hence the lower-case values
generate bjp_hindu = (political_support_2019 == "bjp") ///
    if religion == 1 & !inlist(political_support_2019, "", "did not vote", "didnt vote", ///
                               "uneligible to vote", "na", "refuse to answer", "refused to answer")

** Number of Workers in Each Section **
egen count_workers = count(_id), by(Line_R new_section team)

** Unskilled Workers **
* Skill code 3 is treated as unskilled (codes come from encode of SkillGrade).
* Fix: a missing skill grade used to be coded 0 (not unskilled); it now stays missing.
generate unskilled = (skill == 3) if !missing(skill)

** Caste and Religion **
* Caste code 1 is the reference group; every other code counts as lower caste.
* Undefined for Muslims (religion code 2).
* Fix: "caste != 1" is true when caste is missing, so respondents without a recorded
* caste were counted as lower caste. They are now left missing.
generate lower_caste = (caste != 1) if religion != 2 & !missing(caste)
generate hindu  = (religion == 1)
generate muslim = (religion == 2)

* Combine Political Attitudes into an Index
foreach var of varlist nrc_hindu bjp_hindu {
    quietly summarize `var', detail
    generate `var'_std = (`var' - r(mean)) / r(sd)
}

* Create Political Attitudes Index
* icw_index is not a built-in Stata command; it must already be installed or defined
icw_index nrc_hindu_std bjp_hindu_std, generate(pattitudes_hindu)

*------------------------------*
*  Collapse to Aggregated Level *
*------------------------------*

* Copies of the high-skilled-by-religion variables: the originals are averaged in the
* collapse below, these copies are summed to give head counts per team.
generate n_highskilled_hindu = high_skilled_hindu
generate n_highskilled_muslim = high_skilled_muslim

* Collapse to one row per production line x section x team: means of the individual
* measures, sums of the head-count copies, and the first non-missing line number.
* Fix: schooling was listed as "school", which only resolved to school_years through
* variable-name abbreviation (it fails under "set varabbrev off" and would become
* ambiguous if another variable starting with "school" were added). The full name is
* spelled out; the resulting variable is school_years either way.
collapse (mean) tenure tenure_skilled tenure_unskilled unskilled skill high_skilled ///
         high_skilled_hindu high_skilled_muslim comm_rel comm_rel_hindus orders_rel_hindus ///
         comm_rel_muslim orders_rel_muslim b_attitude_muslim b_attitude_hindu ///
         tenure_hindu tenure_muslim mean_contact_hindu_l mean_contact_muslim_l ///
         mean_contact_ls mean_contact_hindu_ls mean_contact_muslim_ls ///
         mean_contact_hindu_l_std mean_contact_hindu_ls_std ///
         mean_contact_muslim_l_std mean_contact_muslim_ls_std school_years muslim_iat hindu_iat ///
         hindu_int_cross int_cross outgroup_ingroup_r baselineexp_index ///
         baselineexp_index_hindu baselineexp_index_muslim nrc_hindu bjp_hindu ///
         count_workers lower_caste age section_change section_change_hindu ///
         section_change_muslim teammate_old_muslim pattitudes_hindu (sum) ///
         n_highskilled_hindu n_highskilled_muslim (firstnm) LN, ///
         by(Line_R new_section team)

** Count High-Skilled Workers **
* Team share of high-skilled workers times team size; the remainder is the other group
generate count_high_skilled = high_skilled * count_workers
generate count_unskilled = count_workers - count_high_skilled

** Share of Skilled Workers **
* High-skilled Hindus / Muslims as a share of all workers in the team
generate share_hskilled_hindus = n_highskilled_hindu / count_workers
generate share_hskilled_muslims = n_highskilled_muslim / count_workers

* Merge with Share of Muslims in Each Line-Section-Team After Randomization *
* Only teams present in both files are kept
merge 1:1 Line_R team new_section using "$Data/Original/linesection_sharemuslims.dta", keep(3) nogen

*------------------------------*
*      Label Aggregated Vars   *
*------------------------------*

* Variable labels for the team-level (collapsed) data.
* Fix: the schooling variable is referred to by its full name school_years; the earlier
* "school" only worked through variable-name abbreviation.
* Added: labels for muslim_iat and count_unskilled, whose counterparts hindu_iat and
* count_high_skilled were already labelled.
label variable school_years                   "Average team schooling"
label variable age                            "Age"
label variable unskilled                      "Share unskilled in team (self-reported at baseline)"
label variable high_skilled                   "Skill-grade based on HR data (used for randomization)"
label variable high_skilled_hindu             "Share of high-skilled Hindus"
label variable high_skilled_muslim            "Share of high-skilled Muslims"
label variable share_hskilled_hindus          "Share of high-skilled Hindus (all workers)"
label variable share_hskilled_muslims         "Share of high-skilled Muslims (all workers)"
label variable count_high_skilled             "Number of high-skilled workers"
label variable count_unskilled                "Number of workers not high-skilled"
label variable skill                          "Self-reported skill grade"
label variable outgroup_ingroup_r             "Outgroup-Ingroup ratio in locality"
label variable comm_rel                       "Comfort communicating with non-coreligionists (team average)"
label variable comm_rel_hindus                "Communicating with non-coreligionists (Hindus)"
label variable comm_rel_muslim                "Communicating with non-coreligionists (Muslims)"
label variable orders_rel_hindus              "Receiving orders from non-coreligionists (Hindus)"
label variable orders_rel_muslim              "Receiving orders from non-coreligionists (Muslims)"
label variable b_attitude_muslim              "Attitudes Index Muslim"
label variable b_attitude_hindu               "Attitudes Index Hindu"
label variable tenure_hindu                   "Average tenure (Hindus)"
label variable tenure_muslim                  "Average tenure (Muslims)"
label variable tenure_skilled                 "Tenure: Skilled Workers"
label variable tenure_unskilled               "Tenure: Unskilled Workers"
label variable mean_contact_hindu_l           "Baseline exposure (Hindus to Muslims): Other sections"
label variable mean_contact_muslim_l          "Baseline exposure (Muslims to Hindus): Other sections"
label variable mean_contact_ls                "Baseline exposure: In Own section"
label variable mean_contact_hindu_ls          "Baseline exposure (Hindus to Muslims): Own section"
label variable mean_contact_muslim_ls         "Baseline exposure (Muslims to Hindus): Own section"
label variable hindu_iat                      "Average IAT Score (Hindus)"
label variable muslim_iat                     "Average IAT Score (Muslims)"
label variable hindu_int_cross                "Baseline exposure of Hindus to Muslims (outside work)"
label variable int_cross                      "Baseline exposure to non-coreligionists (outside work)"
label variable baselineexp_index_hindu        "Baseline Exposure Index (Hindu)"
label variable baselineexp_index_muslim       "Baseline Exposure Index (Muslim)"
label variable baselineexp_index              "Baseline Exposure Index"
label variable nrc_hindu                      "Average support for NRC (Hindus)"
label variable bjp_hindu                      "Average support for BJP (Hindus)"
label variable count_workers                  "Team Size"
label variable lower_caste                    "Share of lower caste in team"
label variable tenure                         "Tenure"
label variable new_section                    "Section/Task"
label variable team                           "Team"
label variable Line_R                         "Production Line"
label variable LN                             "Production Line (Number)"
label variable section_change                 "Share of section changes"
label variable section_change_muslim          "Share of section changes (Muslim)"
label variable section_change_hindu           "Share of section changes (Hindu)"
label variable teammate_old_muslim            "Share of old Muslim teammates in new team for Hindus"
label variable pattitudes_hindu               "Support for Hindutva Politics (Hindus): Standardized"

*------------------------------*
*  Renaming Teams and Sections *
*------------------------------*

* Renaming Teams Based on Randomization Letters.
* LN holds the numbered production line ("Line 1" ... "Line 6"); Line_R holds the product name.
* Each local below lists old-letter new-letter pairs for one line. The new letters
* (W, X, Y, Z) never coincide with an old letter (A-D), so the replacements within a
* line cannot chain into one another.
local swap3 "A X B Y C Z"
local swap1 "A Y B X C Z"
local swap2 "A Y B Z C X"
local swap4 "A Y B X"
local swap5 "A X B Z C Y"
local swap6 "D W"

foreach ln in 3 1 2 4 5 6 {
    local pairs `swap`ln''
    while "`pairs'" != "" {
        * Peel off one old/new pair and apply it to this production line only
        gettoken old pairs : pairs
        gettoken new pairs : pairs
        replace team = "`new'" if team == "`old'" & LN == "Line `ln'"
    }
}

* Changing Section Names to Merge with Production Data
replace new_section = "Fline"    if new_section == "1St Line"
replace new_section = "Sline"    if new_section == "2Nd Line"
replace new_section = "Injector" if new_section == "Inject"
replace new_section = "Tray"     if inlist(new_section, "Tray Wash", "Tray/Cooling")

* On the Celebration line every tray section (including those just renamed) is called Cooling
replace new_section = "Cooling"  if new_section == "Tray" & Line_R == "Celebration"

*------------------------------*
*    Standardizing Measures    *
*------------------------------*

 * Many sections have no skilled workers, which leaves their mean skilled tenure missing.
 * Those cases are set to zero so the variable can be standardized for every team.
replace tenure_skilled = 0 if missing(tenure_skilled)

* Team-level measures to z-score; each gets a companion variable with the suffix _std
local standard_vars count_workers school_years tenure unskilled high_skilled ///
                    high_skilled_hindu high_skilled_muslim tenure_skilled ///
                    tenure_unskilled share_m age count_high_skilled ///
                    count_unskilled share_hskilled_hindus share_hskilled_muslims

foreach measure of local standard_vars {
    quietly summarize `measure', detail
    quietly generate `measure'_std = (`measure' - r(mean)) / r(sd)
}

* Sanity check: the collapsed file must contain exactly 117 line-section-team rows
assert _N == 117

* Save Aggregated Data
save "$Data/Original/Section_Aggregates.dta", replace

